This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Write code to create a new data frame,called 'pf.fc_by_age_gender', that contains information on each age AND gender group.

The data frame should contain the following variables:

    mean_friend_count,
    median_friend_count,
    n (the number of users in each age and gender grouping)
pf=read.csv("pseudo_facebook.tsv",sep='\t')
suppressMessages(library(dplyr))
pf.fc_by_age_gender<-pf %>%
  group_by(age,gender) %>%
  summarise(mean_friend_count=mean(friend_count),
            median_friend_count=as.numeric(median(friend_count)),
            n=n()) %>%
  ungroup()%>%
  arrange(age)

head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
## 
##   age gender mean_friend_count median_friend_count    n
## 1  13 female          259.1606               148.0  193
## 2  13   male          102.1340                55.0  291
## 3  14 female          362.4286               224.0  847
## 4  14   male          164.1456                92.5 1078
## 5  15 female          538.6813               276.0 1139
## 6  15   male          200.6658               106.5 1478
Create a line graph showing the median friend count over the ages
for each gender.
library("ggplot2")
ggplot(aes(y=median_friend_count,x=age),data=subset(pf.fc_by_age_gender,!is.na(gender)))+geom_line(aes(color=gender))

str(pf.fc_by_age_gender)
## Classes 'tbl_df', 'tbl' and 'data.frame':    274 obs. of  5 variables:
##  $ age                : int  13 13 14 14 15 15 15 16 16 17 ...
##  $ gender             : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 NA 1 2 1 ...
##  $ mean_friend_count  : num  259 102 362 164 539 ...
##  $ median_friend_count: num  148 55 224 92.5 276 ...
##  $ n                  : int  193 291 847 1078 1139 1478 1 1238 1848 1236 ...
library("reshape2")
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
## 
##   age gender mean_friend_count median_friend_count    n
## 1  13 female          259.1606               148.0  193
## 2  13   male          102.1340                55.0  291
## 3  14 female          362.4286               224.0  847
## 4  14   male          164.1456                92.5 1078
## 5  15 female          538.6813               276.0 1139
## 6  15   male          200.6658               106.5 1478
pf.fc_by_age_gender.wide<-dcast(pf.fc_by_age_gender,age~gender,value.var="median_friend_count")
str(pf.fc_by_age_gender.wide)
## 'data.frame':    101 obs. of  4 variables:
##  $ age   : int  13 14 15 16 17 18 19 20 21 22 ...
##  $ female: num  148 224 276 258 246 ...
##  $ male  : num  55 92.5 106.5 136 125 ...
##  $ NA    : num  NA NA 116 NA 106 ...
head(pf.fc_by_age_gender.wide)
##   age female  male    NA
## 1  13  148.0  55.0    NA
## 2  14  224.0  92.5    NA
## 3  15  276.0 106.5 116.0
## 4  16  258.5 136.0    NA
## 5  17  245.5 125.0 106.5
## 6  18  243.0 122.0    NA
Plot the ratio of the female to male median friend counts using the data frame pf.fc_by_age_gender.wide.


Add a horizontal line to the plot with a y intercept of 1, which will be the base line
pf.fc_by_age_gender.wide$ratio=pf.fc_by_age_gender.wide$female/pf.fc_by_age_gender.wide$male
head(pf.fc_by_age_gender.wide)
##   age female  male    NA    ratio
## 1  13  148.0  55.0    NA 2.690909
## 2  14  224.0  92.5    NA 2.421622
## 3  15  276.0 106.5 116.0 2.591549
## 4  16  258.5 136.0    NA 1.900735
## 5  17  245.5 125.0 106.5 1.964000
## 6  18  243.0 122.0    NA 1.991803
ggplot(aes(x=age,y=ratio),data=pf.fc_by_age_gender.wide)+geom_line()+
  geom_hline(yintercept=1,color='red',linetype=2,alpha=0.05)

 Create a variable called year_joined in the pf data frame using the variable tenure and 2014 as the reference year.
 The variable year joined should contain the year that a user joined facebook.
names(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"
pf$tenure_in_years<-pf$tenure/365
head(pf)
##    userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382  14      19     1999        11   male    266            0
## 2 1192601  14       2     1999        11 female      6            0
## 3 2083884  14      16     1999        11   male     13            0
## 4 1203168  14      25     1999        12 female     93            0
## 5 1733186  14       4     1999        12   male     82            0
## 6 1524765  14       1     1999        12   male     15            0
##   friendships_initiated likes likes_received mobile_likes
## 1                     0     0              0            0
## 2                     0     0              0            0
## 3                     0     0              0            0
## 4                     0     0              0            0
## 5                     0     0              0            0
## 6                     0     0              0            0
##   mobile_likes_received www_likes www_likes_received tenure_in_years
## 1                     0         0                  0      0.72876712
## 2                     0         0                  0      0.01643836
## 3                     0         0                  0      0.03561644
## 4                     0         0                  0      0.25479452
## 5                     0         0                  0      0.22465753
## 6                     0         0                  0      0.04109589
pf$year_joined=floor(2014-pf$tenure_in_years)
summary(pf$year_joined)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2005    2012    2012    2012    2013    2014       2
table(pf$year_joined)
## 
##  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014 
##     9    15   581  1507  4557  5448  9860 33366 43588    70
 Create a new variable in the data frame called year_joined.bucket by using the cut function on the variable year_joined.
pf$year_joined.bucket=cut(pf$year_joined,breaks=c(2004,2009,2011,2012,2014))
table(pf$year_joined.bucket)
## 
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] 
##        6669       15308       33366       43658
Create a line graph of friend_count vs. age so that each year_joined.bucket is a line tracking the median user friend_count across age. This means you should have four different lines on your plot.
You should subset the data to exclude the users whose year_joined.bucket is NA.
ggplot(aes(x=age,y=friend_count),data=subset(pf,!is.na(year_joined.bucket)))+  geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median)


 (1) Add another geom_line to code below  to plot the grand mean of the friend count vs age.

 (2) Exclude any users whose year_joined.bucket is NA.

 (3) Use a different line type for the grand mean.
ggplot(aes(x=age,y=friend_count),data=subset(pf,!is.na(year_joined.bucket)))+  geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean)+geom_line(stat="summary",fun.y=mean,linetype=2)

str(pf)
## 'data.frame':    99003 obs. of  18 variables:
##  $ userid               : int  2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
##  $ age                  : int  14 14 14 14 14 14 13 13 13 13 ...
##  $ dob_day              : int  19 2 16 25 4 1 14 4 1 2 ...
##  $ dob_year             : int  1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
##  $ dob_month            : int  11 11 11 12 12 12 1 1 1 2 ...
##  $ gender               : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
##  $ tenure               : int  266 6 13 93 82 15 12 0 81 171 ...
##  $ friend_count         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ friendships_initiated: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ likes                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ likes_received       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mobile_likes         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mobile_likes_received: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ www_likes            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ www_likes_received   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ tenure_in_years      : num  0.7288 0.0164 0.0356 0.2548 0.2247 ...
##  $ year_joined          : num  2013 2013 2013 2013 2013 ...
##  $ year_joined.bucket   : Factor w/ 4 levels "(2004,2009]",..: 4 4 4 4 4 4 4 4 4 4 ...
pf_req<-subset(pf,pf$tenure>1)
pf_req$friends_making_rate=pf_req$friend_count/pf_req$tenure
median(pf_req$friends_making_rate)
## [1] 0.2204301
max(pf_req$friends_making_rate)
## [1] 417
Create a line graph of mean of friendships_initiated per day (of tenure) vs. tenure colored by year_joined.bucket.
ggplot(aes(x=tenure,y=friends_making_rate),data=subset(pf_req,!is.na(year_joined.bucket)))+geom_line(aes(color=year_joined.bucket),stat='summary',fun.y=mean)

ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_smooth(aes(color = year_joined.bucket))
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

library(GGally)
## 
## Attaching package: 'GGally'
## 
## The following object is masked from 'package:dplyr':
## 
##     nasa
yo=read.csv("yogurt.csv",header=TRUE,stringsAsFactors = TRUE)
str(yo)
## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : int  2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...
yo$id<-factor(yo$id)
summary(yo)
##       obs               id            time         strawberry     
##  Min.   :   1.0   2132290:  74   Min.   : 9662   Min.   : 0.0000  
##  1st Qu.: 696.5   2130583:  59   1st Qu.: 9843   1st Qu.: 0.0000  
##  Median :1369.5   2124073:  50   Median :10045   Median : 0.0000  
##  Mean   :1367.8   2149500:  50   Mean   :10050   Mean   : 0.6492  
##  3rd Qu.:2044.2   2101790:  47   3rd Qu.:10255   3rd Qu.: 1.0000  
##  Max.   :2743.0   2129528:  39   Max.   :10459   Max.   :11.0000  
##                   (Other):2061                                    
##    blueberry        pina.colada          plain         mixed.berry    
##  Min.   : 0.0000   Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 0.0000   Median : 0.0000   Median :0.0000   Median :0.0000  
##  Mean   : 0.3571   Mean   : 0.3584   Mean   :0.2176   Mean   :0.3887  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :12.0000   Max.   :10.0000   Max.   :6.0000   Max.   :8.0000  
##                                                                       
##      price      
##  Min.   :20.00  
##  1st Qu.:50.00  
##  Median :65.04  
##  Mean   :59.25  
##  3rd Qu.:68.96  
##  Max.   :68.96  
## 
ggplot(aes(x=price),data=yo)+geom_histogram(binwidth=2)

ggplot(aes(x=time,y=price),data=yo)+geom_point()

set.seed(210)
sample.ids<-sample(levels(yo$id),16)
library(ggplot2)
data(diamonds)
Create a histogram of diamond prices. Facet the histogram by diamond color and use cut to color the histogram bars.
str(diamonds)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
summary(diamonds$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820
ggplot(aes(x=price),data=diamonds)+geom_histogram(aes(fill=cut),binwidth=0.1)+facet_wrap(~color)+scale_x_log10()+scale_fill_brewer(type='qual')

Create a scatterplot of diamond price vs. table and color the points by the cut of the diamond.
ggplot(aes(y=price,x=table),data=diamonds)+geom_point(aes(color=cut))+scale_color_brewer(type='qual')

Create a scatterplot of diamond price vs. volume (x * y * z) and color the points by  the clarity of diamonds. Use scale on the y-axis to take the log10 of price. You should also omit the top 1% of diamond volumes from the plot.
diamonds<-transform(diamonds,volume=x*y*z)
diamonds_req<-subset(diamonds,(volume<=quantile(diamonds$volume,0.9)))
ggplot(aes(y=price,x=volume),data=diamonds_req)+geom_point(aes(color=clarity))+scale_y_log10()+scale_color_brewer(type = 'div')

pf=read.csv("pseudo_facebook.tsv",sep='\t')
str(pf)
## 'data.frame':    99003 obs. of  15 variables:
##  $ userid               : int  2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
##  $ age                  : int  14 14 14 14 14 14 13 13 13 13 ...
##  $ dob_day              : int  19 2 16 25 4 1 14 4 1 2 ...
##  $ dob_year             : int  1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
##  $ dob_month            : int  11 11 11 12 12 12 1 1 1 2 ...
##  $ gender               : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
##  $ tenure               : int  266 6 13 93 82 15 12 0 81 171 ...
##  $ friend_count         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ friendships_initiated: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ likes                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ likes_received       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mobile_likes         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mobile_likes_received: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ www_likes            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ www_likes_received   : int  0 0 0 0 0 0 0 0 0 0 ...
head(pf)
##    userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382  14      19     1999        11   male    266            0
## 2 1192601  14       2     1999        11 female      6            0
## 3 2083884  14      16     1999        11   male     13            0
## 4 1203168  14      25     1999        12 female     93            0
## 5 1733186  14       4     1999        12   male     82            0
## 6 1524765  14       1     1999        12   male     15            0
##   friendships_initiated likes likes_received mobile_likes
## 1                     0     0              0            0
## 2                     0     0              0            0
## 3                     0     0              0            0
## 4                     0     0              0            0
## 5                     0     0              0            0
## 6                     0     0              0            0
##   mobile_likes_received www_likes www_likes_received
## 1                     0         0                  0
## 2                     0         0                  0
## 3                     0         0                  0
## 4                     0         0                  0
## 5                     0         0                  0
## 6                     0         0                  0
pf$prop_initiated<-ifelse(pf$friend_count>0,pf$friendships_initiated/pf$friend_count,0)
Create a line graph of the median proportion of friendships initiated ('prop_initiated') vs.
tenure and color the line segment by year_joined.bucket.
pf$tenure_in_years<-pf$tenure/365
head(pf)
##    userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382  14      19     1999        11   male    266            0
## 2 1192601  14       2     1999        11 female      6            0
## 3 2083884  14      16     1999        11   male     13            0
## 4 1203168  14      25     1999        12 female     93            0
## 5 1733186  14       4     1999        12   male     82            0
## 6 1524765  14       1     1999        12   male     15            0
##   friendships_initiated likes likes_received mobile_likes
## 1                     0     0              0            0
## 2                     0     0              0            0
## 3                     0     0              0            0
## 4                     0     0              0            0
## 5                     0     0              0            0
## 6                     0     0              0            0
##   mobile_likes_received www_likes www_likes_received prop_initiated
## 1                     0         0                  0              0
## 2                     0         0                  0              0
## 3                     0         0                  0              0
## 4                     0         0                  0              0
## 5                     0         0                  0              0
## 6                     0         0                  0              0
##   tenure_in_years
## 1      0.72876712
## 2      0.01643836
## 3      0.03561644
## 4      0.25479452
## 5      0.22465753
## 6      0.04109589
pf$year_joined=floor(2014-pf$tenure_in_years)
summary(pf$year_joined)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2005    2012    2012    2012    2013    2014       2
table(pf$year_joined)
## 
##  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014 
##     9    15   581  1507  4557  5448  9860 33366 43588    70
pf$year_joined.bucket=cut(pf$year_joined,breaks=c(2004,2009,2011,2012,2014))
ggplot(aes(x=tenure,y=prop_initiated),data=pf)+geom_line(stat='summary',fun.y='median',aes(color=year_joined.bucket))
## Warning: Removed 2 rows containing missing values (stat_summary).

Smooth the last plot you created of of prop_initiated vs tenure colored by year_joined.bucket. You can us e largerbins for tenure or add a smoother to the plot.
ggplot(aes(x=tenure,y=prop_initiated),data=pf)+geom_line(stat='summary',fun.y='median',aes(color=year_joined.bucket))+stat_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1)
## Warning: Removed 2 rows containing missing values (stat_summary).
## Warning: Removed 2 rows containing missing values (stat_smooth).

Create a scatter plot of the price/carat ratio of diamonds. The variable x should be assigned to cut. The points should be colored  by diamond color, and the plot should be faceted by clarity.
str(diamonds)
## 'data.frame':    53940 obs. of  11 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
##  $ volume : num  38.2 34.5 38.1 46.7 51.9 ...
ggplot(aes(x=cut,y=price/carat),data=diamonds)+geom_jitter(aes(color=diamonds$color))+facet_wrap(~clarity)+scale_color_brewer(type = 'div')